# Read csv from project dir
life_df <- read_csv("C:/Users/e005108/Downloads/datasets_12603_17232_Life Expectancy Data.csv")
## Parsed with column specification:
## cols(
## .default = col_double(),
## Country = col_character(),
## Status = col_character()
## )
## See spec(...) for full column specifications.
#life_df <- read_csv(file.choose())
#life_df <- read_csv("data/LifeExpectancyData_w_regions.csv")
life_df <-
life_df %>%
filter(Year == 2014)
head(life_df)
## # A tibble: 6 x 22
## Country Year Status `Life expectanc~ `Adult Mortalit~ `infant deaths` Alcohol
## <chr> <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Afghan~ 2014 Devel~ 59.9 271 64 0.01
## 2 Albania 2014 Devel~ 77.5 8 0 4.51
## 3 Algeria 2014 Devel~ 75.4 11 21 0.01
## 4 Angola 2014 Devel~ 51.7 348 67 8.33
## 5 Antigu~ 2014 Devel~ 76.2 131 0 8.56
## 6 Argent~ 2014 Devel~ 76.2 118 8 7.93
## # ... with 15 more variables: `percentage expenditure` <dbl>, `Hepatitis
## # B` <dbl>, Measles <dbl>, BMI <dbl>, `under-five deaths` <dbl>, Polio <dbl>,
## # `Total expenditure` <dbl>, Diphtheria <dbl>, `HIV/AIDS` <dbl>, GDP <dbl>,
## # Population <dbl>, `thinness 1-19 years` <dbl>, `thinness 5-9 years` <dbl>,
## # `Income composition of resources` <dbl>, Schooling <dbl>
for (i in 1:length(names(life_df))) {
ifelse(
grep(" ",names(life_df)[i]) == TRUE,
names(life_df)[i] <- gsub(" ", "_", names(life_df)[i]),
next
)
}
names(life_df)
## [1] "Country" "Year"
## [3] "Status" "Life_expectancy"
## [5] "Adult_Mortality" "infant_deaths"
## [7] "Alcohol" "percentage_expenditure"
## [9] "Hepatitis_B" "Measles"
## [11] "BMI" "under-five_deaths"
## [13] "Polio" "Total_expenditure"
## [15] "Diphtheria" "HIV/AIDS"
## [17] "GDP" "Population"
## [19] "thinness__1-19_years" "thinness_5-9_years"
## [21] "Income_composition_of_resources" "Schooling"
summary(life_df)
## Country Year Status Life_expectancy
## Length:183 Min. :2014 Length:183 Min. :48.10
## Class :character 1st Qu.:2014 Class :character 1st Qu.:65.60
## Mode :character Median :2014 Mode :character Median :73.60
## Mean :2014 Mean :71.54
## 3rd Qu.:2014 3rd Qu.:76.85
## Max. :2014 Max. :89.00
##
## Adult_Mortality infant_deaths Alcohol percentage_expenditure
## Min. : 1.0 Min. : 0.00 Min. : 0.010 Min. : 0.00
## 1st Qu.: 66.0 1st Qu.: 0.00 1st Qu.: 0.010 1st Qu.: 11.06
## Median :135.0 Median : 2.00 Median : 0.320 Median : 151.10
## Mean :148.7 Mean : 24.56 Mean : 3.271 Mean : 1001.91
## 3rd Qu.:216.5 3rd Qu.: 18.00 3rd Qu.: 6.700 3rd Qu.: 703.21
## Max. :522.0 Max. :957.00 Max. :15.190 Max. :19479.91
## NA's :1
## Hepatitis_B Measles BMI under-five_deaths
## Min. : 2.00 Min. : 0 Min. : 2.00 Min. : 0.00
## 1st Qu.:79.00 1st Qu.: 0 1st Qu.:23.20 1st Qu.: 0.00
## Median :93.00 Median : 13 Median :47.40 Median : 3.00
## Mean :83.12 Mean : 1831 Mean :41.03 Mean : 32.89
## 3rd Qu.:97.00 3rd Qu.: 316 3rd Qu.:59.80 3rd Qu.: 22.00
## Max. :99.00 Max. :79563 Max. :77.10 Max. :1200.00
## NA's :10 NA's :2
## Polio Total_expenditure Diphtheria HIV/AIDS
## Min. : 8.00 Min. : 1.210 Min. : 2.00 Min. :0.100
## 1st Qu.:80.00 1st Qu.: 4.480 1st Qu.:83.00 1st Qu.:0.100
## Median :94.00 Median : 5.840 Median :94.00 Median :0.100
## Mean :84.73 Mean : 6.201 Mean :84.08 Mean :0.682
## 3rd Qu.:97.00 3rd Qu.: 7.740 3rd Qu.:97.00 3rd Qu.:0.400
## Max. :99.00 Max. :17.140 Max. :99.00 Max. :9.400
## NA's :2
## GDP Population thinness__1-19_years
## Min. : 12.28 Min. :4.100e+01 Min. : 0.100
## 1st Qu.: 617.99 1st Qu.:2.869e+05 1st Qu.: 1.500
## Median : 3154.51 Median :1.568e+06 Median : 3.300
## Mean : 10015.57 Mean :2.106e+07 Mean : 4.533
## 3rd Qu.: 8239.95 3rd Qu.:8.080e+06 3rd Qu.: 6.600
## Max. :119172.74 Max. :1.294e+09 Max. :26.800
## NA's :28 NA's :41 NA's :2
## thinness_5-9_years Income_composition_of_resources Schooling
## Min. : 0.100 Min. :0.3450 Min. : 4.90
## 1st Qu.: 1.500 1st Qu.:0.5700 1st Qu.:10.80
## Median : 3.400 Median :0.7220 Median :13.00
## Mean : 4.676 Mean :0.6884 Mean :12.89
## 3rd Qu.: 6.600 3rd Qu.:0.7960 3rd Qu.:14.90
## Max. :27.400 Max. :0.9450 Max. :20.40
## NA's :2 NA's :10 NA's :10
gg_miss_var(life_df)
life_df %>%
filter(
Status == "Developed"
) %>%
ggplot() +
geom_col(
aes(
x = reorder(Country, Life_expectancy),
y = Life_expectancy,
fill = Status
)
) +
coord_flip() +
theme(
axis.text.y = element_text(size=6),
)
life_df %>%
filter(
Status == "Developing"
) %>%
ggplot() +
geom_col(
aes(
x = reorder(Country, Life_expectancy),
y = Life_expectancy,
fill = Status
)
) +
coord_flip() +
theme(
axis.text.y = element_text(size=2),
)
life <- life_df %>%
drop_na()
summary(life)
## Country Year Status Life_expectancy
## Length:131 Min. :2014 Length:131 Min. :48.10
## Class :character 1st Qu.:2014 Class :character 1st Qu.:64.65
## Mode :character Median :2014 Mode :character Median :72.00
## Mean :2014 Mean :70.52
## 3rd Qu.:2014 3rd Qu.:75.80
## Max. :2014 Max. :89.00
## Adult_Mortality infant_deaths Alcohol percentage_expenditure
## Min. : 2.0 Min. : 0.00 Min. : 0.010 Min. : 0.443
## 1st Qu.: 74.5 1st Qu.: 0.00 1st Qu.: 0.010 1st Qu.: 48.311
## Median :144.0 Median : 3.00 Median : 0.010 Median : 198.734
## Mean :160.4 Mean : 28.56 Mean : 3.061 Mean : 850.874
## 3rd Qu.:225.0 3rd Qu.: 20.00 3rd Qu.: 6.305 3rd Qu.: 718.324
## Max. :522.0 Max. :957.00 Max. :15.190 Max. :16255.162
## Hepatitis_B Measles BMI under-five_deaths
## Min. : 2.00 Min. : 0.0 Min. : 2.00 Min. : 0.00
## 1st Qu.:78.00 1st Qu.: 0.0 1st Qu.:22.85 1st Qu.: 1.00
## Median :91.00 Median : 10.0 Median :45.90 Median : 3.00
## Mean :81.71 Mean : 2042.9 Mean :40.48 Mean : 38.24
## 3rd Qu.:96.00 3rd Qu.: 289.5 3rd Qu.:59.45 3rd Qu.: 25.50
## Max. :99.00 Max. :79563.0 Max. :77.10 Max. :1200.00
## Polio Total_expenditure Diphtheria HIV/AIDS
## Min. : 8.0 Min. : 1.210 Min. : 2.00 Min. :0.1000
## 1st Qu.:78.0 1st Qu.: 4.485 1st Qu.:80.00 1st Qu.:0.1000
## Median :92.0 Median : 5.820 Median :92.00 Median :0.1000
## Mean :83.5 Mean : 6.107 Mean :83.89 Mean :0.8099
## 3rd Qu.:97.0 3rd Qu.: 7.630 3rd Qu.:97.00 3rd Qu.:0.5000
## Max. :99.0 Max. :13.730 Max. :99.00 Max. :9.4000
## GDP Population thinness__1-19_years
## Min. : 12.28 Min. :4.100e+01 Min. : 0.100
## 1st Qu.: 554.92 1st Qu.:2.876e+05 1st Qu.: 1.500
## Median : 2522.80 Median :1.563e+06 Median : 3.300
## Mean : 7256.85 Mean :2.227e+07 Mean : 4.648
## 3rd Qu.: 7438.05 3rd Qu.:8.059e+06 3rd Qu.: 6.650
## Max. :119172.74 Max. :1.294e+09 Max. :26.800
## thinness_5-9_years Income_composition_of_resources Schooling
## Min. : 0.100 Min. :0.3450 Min. : 5.30
## 1st Qu.: 1.550 1st Qu.:0.5440 1st Qu.:10.75
## Median : 3.500 Median :0.6970 Median :12.70
## Mean : 4.886 Mean :0.6697 Mean :12.68
## 3rd Qu.: 6.800 3rd Qu.:0.7790 3rd Qu.:14.70
## Max. :27.400 Max. :0.9360 Max. :20.40
range(life$Life_expectancy)
## [1] 48.1 89.0
22 Variables, 20 of them are Numerical, and 2 of them are Categorical. Variables we should drop: Country, Year Hepatitis.B has the Min Value and 1st Quartile difference of 76 which is too high (Factor?) Polio has the Min value and 1st QUartile difference of 70 which is too high (Factor?) Diphtheria has the min value and 1st quartile difference of 78 which is too high (Factor?)
According to the World Health Organization in 2018 they said that 86% of children in the world are receiving immunizations protecting them from these diseases. Source: https://www.chop.edu/centers-programs/vaccine-education-center/global-immunization/diseases-and-vaccines-world-view Let us use 86% as the benchmark to turn these columns into factors.
life_new <- life %>%
select(-Country, -Year) %>%
mutate(Hepatitis_B = ifelse(Hepatitis_B < 86, "<86% Immunized", ">=86% Immunized"),
Polio = ifelse(Polio < 86, "<86% Immunized", ">=86% Immunized"),
Diphtheria = ifelse(Diphtheria < 86, "<86% Immunized", ">=86% Immunized"),
Hepatitis_B = as.factor(Hepatitis_B),
Polio = as.factor(Polio),
Diphtheria = as.factor(Diphtheria))
str(life_new)
## tibble [131 x 20] (S3: tbl_df/tbl/data.frame)
## $ Status : chr [1:131] "Developing" "Developing" "Developing" "Developing" ...
## $ Life_expectancy : num [1:131] 59.9 77.5 75.4 51.7 76.2 74.6 82.7 81.4 72.5 71.4 ...
## $ Adult_Mortality : num [1:131] 271 8 11 348 118 12 6 66 119 132 ...
## $ infant_deaths : num [1:131] 64 0 21 67 8 1 1 0 5 98 ...
## $ Alcohol : num [1:131] 0.01 4.51 0.01 8.33 7.93 ...
## $ percentage_expenditure : num [1:131] 73.5 428.7 54.2 24 847.4 ...
## $ Hepatitis_B : Factor w/ 2 levels "<86% Immunized",..: 1 2 2 1 2 2 2 2 2 2 ...
## $ Measles : num [1:131] 492 0 0 11699 1 ...
## $ BMI : num [1:131] 18.6 57.2 58.4 22.7 62.2 54.1 66.1 57.1 51.5 17.7 ...
## $ under-five_deaths : num [1:131] 86 1 24 101 9 1 1 0 6 121 ...
## $ Polio : Factor w/ 2 levels "<86% Immunized",..: 1 2 2 1 2 2 2 2 2 2 ...
## $ Total_expenditure : num [1:131] 8.18 5.88 7.21 3.31 4.79 ...
## $ Diphtheria : Factor w/ 2 levels "<86% Immunized",..: 1 2 2 1 2 2 2 2 2 2 ...
## $ HIV/AIDS : num [1:131] 0.1 0.1 0.1 2 0.1 0.1 0.1 0.1 0.1 0.1 ...
## $ GDP : num [1:131] 613 4576 548 479 12245 ...
## $ Population : num [1:131] 327582 288914 39113313 2692466 42981515 ...
## $ thinness__1-19_years : num [1:131] 17.5 1.2 6 8.5 1 2.1 0.6 1.8 2.8 18.1 ...
## $ thinness_5-9_years : num [1:131] 17.5 1.3 5.8 8.3 0.9 2.1 0.6 2 2.9 18.6 ...
## $ Income_composition_of_resources: num [1:131] 0.476 0.761 0.741 0.527 0.825 0.739 0.936 0.892 0.752 0.57 ...
## $ Schooling : num [1:131] 10 14.2 14.4 11.4 17.3 12.7 20.4 15.9 12.2 10 ...
## - attr(*, "spec")=
## .. cols(
## .. Country = col_character(),
## .. Year = col_double(),
## .. Status = col_character(),
## .. `Life expectancy` = col_double(),
## .. `Adult Mortality` = col_double(),
## .. `infant deaths` = col_double(),
## .. Alcohol = col_double(),
## .. `percentage expenditure` = col_double(),
## .. `Hepatitis B` = col_double(),
## .. Measles = col_double(),
## .. BMI = col_double(),
## .. `under-five deaths` = col_double(),
## .. Polio = col_double(),
## .. `Total expenditure` = col_double(),
## .. Diphtheria = col_double(),
## .. `HIV/AIDS` = col_double(),
## .. GDP = col_double(),
## .. Population = col_double(),
## .. `thinness 1-19 years` = col_double(),
## .. `thinness 5-9 years` = col_double(),
## .. `Income composition of resources` = col_double(),
## .. Schooling = col_double()
## .. )
Check the correlation of the numerical variables
life_numerical <- life_new %>%
select_if(is.numeric)
ggcorr(life_numerical,
label = T,
label_size = 2,
label_round = 2,
hjust = 1,
size = 3,
color = "black",
layout.exp = 5,
low = "forestgreen",
mid = "gray95",
high = "darkorange",
name = "Correlation")
Life_expectancy has a strong negative correlation with Adult_Mortality which is understandable since if mortality rate in adults is high, Life exepctancy would be lower. Let us compare our Correlation Matrix with a VIF model to see which columns we should drop.
life_df_fctr2 <- life_new #removing the country name, indicator vars
life_fctr2model <- lm(Life_expectancy~.,data=life_df_fctr2) # . means all variable not mpg
vif(life_fctr2model)
## Status Adult_Mortality
## 1.690003 2.671429
## infant_deaths Alcohol
## 404.004911 2.064448
## percentage_expenditure Hepatitis_B
## 11.950842 7.194311
## Measles BMI
## 2.867046 2.193348
## `under-five_deaths` Polio
## 324.203388 5.264166
## Total_expenditure Diphtheria
## 1.309070 11.696480
## `HIV/AIDS` GDP
## 1.880140 12.482209
## Population `thinness__1-19_years`
## 8.013231 12.991545
## `thinness_5-9_years` Income_composition_of_resources
## 13.140587 11.347212
## Schooling
## 7.468005
Life Expenctancy has a strong positive correlation with Schooling and Income_composition_of_resources but Schooling and Income_composition_of_resources have a high correlation with one another. Income_composition_of_resources has a VIF of 11.35 (vs 7.47 for Schooling) so let’s drop that.
Infant_deaths and under.five_deaths have a correlation of 1 (100%) which would tell us that there is multicollinearity between them. Both have extremely high VIFs, so we should drop one of the variables. and I believe under 5 deaths should be dropped since it covers more ages and includes infant deaths within it.
Percentage_expenditure vs GDP is also highly correlated with one another with both values having high VIFs. GDP is a more well known datapoint and has a higher correlation with Life_expectancy, so let’s remove percentage_expenditure.
thinness_5-9_years has a strong correlation with thinness_1-19_years and both have high VIFs. Let’s drop thinness_5-9_years since thinness_1-19_years should have years 5-9 within it as well.
#removes Infant_deaths, Income_composition_of_resources, thinness_5-9_years, and percentage_expenditure.
life_new<- life_new[-c(6,10,18:19)]
life_df_fctr2 <- life_new
life_fctr2model <- lm(Life_expectancy~.,data=life_df_fctr2)
vif(life_fctr2model)
## Status Adult_Mortality infant_deaths
## 1.676668 2.326290 4.614388
## Alcohol Hepatitis_B Measles
## 1.878859 6.867340 2.233165
## BMI Polio Total_expenditure
## 1.835098 5.164795 1.176686
## Diphtheria `HIV/AIDS` GDP
## 11.267731 1.842168 1.479840
## Population `thinness__1-19_years` Schooling
## 4.474000 2.234582 3.327405
The VIFs now look good for the numerical variables.
##Check the distribution of the categorical variables
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 3
## Status count percentage
## <chr> <int> <chr>
## 1 Developed 19 14.5%
## 2 Developing 112 85.5%
According to the boxplots, the distribution of higher life_expectancy resides in the developed countries, with the plot even showing that the median for developed countries (2nd Quartile) beind skewed higher towards the 3rd quartile
We want to know if there is any significant difference between the average life expectancy in Developed and Developing countries.
summary(aov(Life_expectancy ~ Status, data = life_new))
## Df Sum Sq Mean Sq F value Pr(>F)
## Status 1 2453 2453.1 44.12 7.83e-10 ***
## Residuals 129 7173 55.6
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The ANOVA test tells us that there is a significant difference between the life expectancy of Developed countries and the life expectancy of Developing countries.
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 3
## Hepatitis_B count percentage
## <fct> <int> <chr>
## 1 <86% Immunized 44 33.59%
## 2 >=86% Immunized 87 66.41%
More than 1/3 of countries have less than 86% immunized for Hepatitis B. The Life Expectancy of the countries with greater than or equal to 86% Immunized is higher than the countries which have less that 86% Immunized. Note that the median (2nd Quartile) for >=86% immunized is skewed upwards towards the 3rd quartile.
We want to know if there is any significant difference between the average life expectancy in Developed and Developing countries.
summary(aov(Life_expectancy ~ Hepatitis_B, data = life_new))
## Df Sum Sq Mean Sq F value Pr(>F)
## Hepatitis_B 1 1088 1087.6 16.43 8.67e-05 ***
## Residuals 129 8539 66.2
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The ANOVA test tells us that there is a significant difference between the life expectancy of countries that immmunized >=86% and the life expectancy of countries that immunized less than 86% for Hep B.
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 3
## Polio count percentage
## <fct> <int> <chr>
## 1 <86% Immunized 46 35.11%
## 2 >=86% Immunized 85 64.89%
For Polio Immunizations, higher life expectancy resides with the countries that have >=86% Immunized. Note that the median (2nd Quartile) for >=86% immunized is skewed upwards towards the 3rd quartile.
summary(aov(Life_expectancy ~ Polio, data = life_new))
## Df Sum Sq Mean Sq F value Pr(>F)
## Polio 1 2295 2294.8 40.38 3.3e-09 ***
## Residuals 129 7332 56.8
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
According to the ANOVA test, there is a significant difference between the life expectancy of countries that immmunized >=86% and the life expectancy of countries that immunized less than 86% for Polio.
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 3
## Diphtheria count percentage
## <fct> <int> <chr>
## 1 <86% Immunized 39 29.77%
## 2 >=86% Immunized 92 70.23%
For Diphtheria Immunizations, higher life expectancy resides with the countries that have >=86% Immunized. Note that the median (2nd Quartile) for >=86% immunized is skewed upwards towards the 3rd quartile.
summary(aov(Life_expectancy ~ Diphtheria, data = life_new))
## Df Sum Sq Mean Sq F value Pr(>F)
## Diphtheria 1 1934 1934.3 32.44 7.91e-08 ***
## Residuals 129 7692 59.6
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
According to the ANOVA test, there is a significant difference between the life expectancy of countries that immmunized >=86% and the life expectancy of countries that immunized less than 86% for Diphtheria.
life_plot5 <- ggplot(life_new) +
geom_mosaic(aes(x = product(Status), fill=Hepatitis_B)) +
labs(x = NULL, y = NULL) +
scale_fill_manual(values=c("forestgreen", "cadetblue")) +
labs(x = "Status", y = "Hepatitis_B Immunizations") +
theme(legend.position = "none")
ggplotly(life_plot5)
chisq.test(table(life_new$Status, life_new$Hepatitis_B))
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(life_new$Status, life_new$Hepatitis_B)
## X-squared = 4.1582, df = 1, p-value = 0.04143
A larger percentage of Developed countries have >=86% of their people immunized for Hepatitis_B. The Chi-Squared test tells us that Developed and Developing Countries have different coverage of Hepatitis_B immunizations.
life_plot6 <- ggplot(life_new) +
geom_mosaic(aes(x = product(Status), fill=Polio)) +
labs(x = NULL, y = NULL) +
scale_fill_manual(values=c("forestgreen", "cadetblue")) +
labs(x = "Status", y = "Polio Immunizations") +
theme(legend.position = "none")
ggplotly(life_plot6)
chisq.test(table(life_new$Status, life_new$Polio))
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(life_new$Status, life_new$Polio)
## X-squared = 10.292, df = 1, p-value = 0.001336
ALL of the Developed countries have >=86% of their people immunized for Polio. The Chi-Squared test tells us that Developed and Developing Countries have different coverage of Polio immunizations.
life_plot7 <- ggplot(life_new) +
geom_mosaic(aes(x = product(Status), fill=Diphtheria)) +
labs(x = NULL, y = NULL) +
scale_fill_manual(values=c("forestgreen", "cadetblue")) +
labs(x = "Status", y = "Diphtheria Immunizations") +
theme(legend.position = "none")
ggplotly(life_plot7)
chisq.test(table(life_new$Status, life_new$Diphtheria))
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(life_new$Status, life_new$Diphtheria)
## X-squared = 7.8288, df = 1, p-value = 0.005142
ALL of the Developed countries have >=86% of their people immunized for Diphtheria The Chi-Squared test tells us that Developed and Developing Countries have different coverage of Diphtheria immunizations.
These numbers are very similar to Polio numbers, there could be evidence that Polio and Diphteria immunizations are done at the same time therefore one of them could be dropped. Let us see in the next section.
life_lm <- lm(formula = Life_expectancy ~., data = life_new)
summary(life_lm)
##
## Call:
## lm(formula = Life_expectancy ~ ., data = life_new)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.2729 -1.7178 0.1095 2.0201 9.3597
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.071e+01 3.260e+00 18.624 < 2e-16 ***
## StatusDeveloping -1.507e+00 1.155e+00 -1.305 0.19454
## Adult_Mortality -2.608e-02 4.366e-03 -5.974 2.65e-08 ***
## infant_deaths -3.499e-03 6.767e-03 -0.517 0.60610
## Alcohol 1.174e-01 1.057e-01 1.111 0.26903
## Hepatitis_B>=86% Immunized -2.035e+00 1.743e+00 -1.168 0.24541
## Measles 1.618e-05 4.787e-05 0.338 0.73596
## BMI 3.225e-02 2.060e-02 1.565 0.12026
## Polio>=86% Immunized -7.357e-01 1.495e+00 -0.492 0.62372
## Total_expenditure 2.527e-01 1.350e-01 1.871 0.06385 .
## Diphtheria>=86% Immunized 4.263e+00 2.306e+00 1.849 0.06704 .
## `HIV/AIDS` -1.049e+00 2.740e-01 -3.829 0.00021 ***
## GDP 4.030e-05 2.602e-05 1.549 0.12418
## Population 4.617e-09 5.715e-09 0.808 0.42086
## `thinness__1-19_years` -1.017e-01 1.066e-01 -0.953 0.34236
## Schooling 9.389e-01 2.091e-01 4.489 1.71e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.595 on 115 degrees of freedom
## Multiple R-squared: 0.8456, Adjusted R-squared: 0.8255
## F-statistic: 41.99 on 15 and 115 DF, p-value: < 2.2e-16
Coefficients that are negative: StatusDeveloping, Adult_Mortality, infant_deaths, Hepatitis_B>=86% Immunized, Polio>=86% Immunized, HIV/AIDS, and thinness__1-19_years. This tells us that everytime we add one of these variables we expect Life_expectancy to decrease. Diphtheria>=86% Immunized is expected to raise Life_expectancy while Hepatitis_B>=86% Immunized and Polio>=86% Immunized are exoected to decrease Life_expectancy. Adult_Mortality, HIV/AIDS, and Schooling are seen as the best predictors of Life_expectancy according to the model.
According to the Adj. R-Squared, 82.55% of the variation in Life_expectancy can be determined by the model’s inputed values. This is a very good value for the model.
life_step <- lm(formula = Life_expectancy ~., data = life_new)
life_step0 <- lm(formula = Life_expectancy ~1, data = life_new)
life_backward <- step(life_step, direction = "backward")
## Start: AIC=350.18
## Life_expectancy ~ Status + Adult_Mortality + infant_deaths +
## Alcohol + Hepatitis_B + Measles + BMI + Polio + Total_expenditure +
## Diphtheria + `HIV/AIDS` + GDP + Population + `thinness__1-19_years` +
## Schooling
##
## Df Sum of Sq RSS AIC
## - Measles 1 1.48 1487.8 348.31
## - Polio 1 3.13 1489.5 348.46
## - infant_deaths 1 3.46 1489.8 348.49
## - Population 1 8.43 1494.8 348.92
## - `thinness__1-19_years` 1 11.75 1498.1 349.21
## - Alcohol 1 15.94 1502.3 349.58
## - Hepatitis_B 1 17.62 1504.0 349.73
## - Status 1 22.01 1508.4 350.11
## <none> 1486.3 350.18
## - GDP 1 31.00 1517.3 350.89
## - BMI 1 31.67 1518.0 350.95
## - Diphtheria 1 44.18 1530.5 352.02
## - Total_expenditure 1 45.26 1531.6 352.11
## - `HIV/AIDS` 1 189.47 1675.8 363.90
## - Schooling 1 260.51 1746.9 369.34
## - Adult_Mortality 1 461.20 1947.5 383.59
##
## Step: AIC=348.31
## Life_expectancy ~ Status + Adult_Mortality + infant_deaths +
## Alcohol + Hepatitis_B + BMI + Polio + Total_expenditure +
## Diphtheria + `HIV/AIDS` + GDP + Population + `thinness__1-19_years` +
## Schooling
##
## Df Sum of Sq RSS AIC
## - infant_deaths 1 2.43 1490.2 346.53
## - Polio 1 3.13 1491.0 346.59
## - Population 1 10.98 1498.8 347.28
## - `thinness__1-19_years` 1 13.58 1501.4 347.50
## - Hepatitis_B 1 17.16 1505.0 347.82
## - Alcohol 1 17.22 1505.0 347.82
## - Status 1 21.26 1509.1 348.17
## <none> 1487.8 348.31
## - BMI 1 30.35 1518.2 348.96
## - GDP 1 30.38 1518.2 348.96
## - Diphtheria 1 43.59 1531.4 350.10
## - Total_expenditure 1 44.71 1532.5 350.19
## - `HIV/AIDS` 1 195.04 1682.9 362.45
## - Schooling 1 261.98 1749.8 367.56
## - Adult_Mortality 1 460.12 1948.0 381.61
##
## Step: AIC=346.53
## Life_expectancy ~ Status + Adult_Mortality + Alcohol + Hepatitis_B +
## BMI + Polio + Total_expenditure + Diphtheria + `HIV/AIDS` +
## GDP + Population + `thinness__1-19_years` + Schooling
##
## Df Sum of Sq RSS AIC
## - Polio 1 3.40 1493.7 344.83
## - Population 1 12.10 1502.4 345.59
## - `thinness__1-19_years` 1 14.44 1504.7 345.79
## - Hepatitis_B 1 17.05 1507.3 346.02
## - Alcohol 1 17.62 1507.9 346.07
## - Status 1 20.18 1510.4 346.29
## <none> 1490.2 346.53
## - GDP 1 30.29 1520.5 347.16
## - BMI 1 30.80 1521.0 347.21
## - Diphtheria 1 45.10 1535.3 348.43
## - Total_expenditure 1 46.63 1536.9 348.56
## - `HIV/AIDS` 1 198.27 1688.5 360.89
## - Schooling 1 268.21 1758.5 366.21
## - Adult_Mortality 1 464.59 1954.8 380.08
##
## Step: AIC=344.83
## Life_expectancy ~ Status + Adult_Mortality + Alcohol + Hepatitis_B +
## BMI + Total_expenditure + Diphtheria + `HIV/AIDS` + GDP +
## Population + `thinness__1-19_years` + Schooling
##
## Df Sum of Sq RSS AIC
## - Population 1 12.68 1506.3 343.93
## - `thinness__1-19_years` 1 15.26 1508.9 344.16
## - Alcohol 1 16.31 1510.0 344.25
## - Hepatitis_B 1 16.77 1510.4 344.29
## - Status 1 19.58 1513.2 344.53
## <none> 1493.7 344.83
## - GDP 1 30.39 1524.0 345.46
## - BMI 1 31.78 1525.4 345.58
## - Total_expenditure 1 44.86 1538.5 346.70
## - Diphtheria 1 47.92 1541.6 346.96
## - `HIV/AIDS` 1 195.02 1688.7 358.90
## - Schooling 1 268.41 1762.1 364.47
## - Adult_Mortality 1 461.76 1955.4 378.11
##
## Step: AIC=343.93
## Life_expectancy ~ Status + Adult_Mortality + Alcohol + Hepatitis_B +
## BMI + Total_expenditure + Diphtheria + `HIV/AIDS` + GDP +
## `thinness__1-19_years` + Schooling
##
## Df Sum of Sq RSS AIC
## - `thinness__1-19_years` 1 5.83 1512.2 342.44
## - Status 1 18.44 1524.8 343.53
## - Hepatitis_B 1 18.48 1524.8 343.53
## - Alcohol 1 19.84 1526.2 343.65
## <none> 1506.3 343.93
## - GDP 1 30.62 1537.0 344.57
## - BMI 1 34.43 1540.8 344.89
## - Total_expenditure 1 43.85 1550.2 345.69
## - Diphtheria 1 44.88 1551.2 345.78
## - `HIV/AIDS` 1 198.24 1704.6 358.13
## - Schooling 1 302.29 1808.6 365.89
## - Adult_Mortality 1 459.48 1965.8 376.81
##
## Step: AIC=342.44
## Life_expectancy ~ Status + Adult_Mortality + Alcohol + Hepatitis_B +
## BMI + Total_expenditure + Diphtheria + `HIV/AIDS` + GDP +
## Schooling
##
## Df Sum of Sq RSS AIC
## - Hepatitis_B 1 19.05 1531.2 342.08
## - Status 1 19.41 1531.6 342.11
## - Alcohol 1 20.94 1533.1 342.24
## <none> 1512.2 342.44
## - GDP 1 32.62 1544.8 343.23
## - Diphtheria 1 44.24 1556.4 344.22
## - Total_expenditure 1 46.59 1558.8 344.41
## - BMI 1 52.90 1565.1 344.94
## - `HIV/AIDS` 1 193.78 1705.9 356.23
## - Schooling 1 326.72 1838.9 366.06
## - Adult_Mortality 1 459.47 1971.6 375.20
##
## Step: AIC=342.08
## Life_expectancy ~ Status + Adult_Mortality + Alcohol + BMI +
## Total_expenditure + Diphtheria + `HIV/AIDS` + GDP + Schooling
##
## Df Sum of Sq RSS AIC
## - Status 1 20.58 1551.8 341.83
## <none> 1531.2 342.08
## - GDP 1 28.71 1559.9 342.51
## - Alcohol 1 29.72 1560.9 342.60
## - Diphtheria 1 40.14 1571.4 343.47
## - Total_expenditure 1 53.10 1584.3 344.54
## - BMI 1 63.53 1594.7 345.40
## - `HIV/AIDS` 1 189.41 1720.6 355.36
## - Schooling 1 315.95 1847.2 364.65
## - Adult_Mortality 1 478.79 2010.0 375.72
##
## Step: AIC=341.83
## Life_expectancy ~ Adult_Mortality + Alcohol + BMI + Total_expenditure +
## Diphtheria + `HIV/AIDS` + GDP + Schooling
##
## Df Sum of Sq RSS AIC
## <none> 1551.8 341.83
## - Diphtheria 1 40.40 1592.2 343.19
## - Alcohol 1 41.76 1593.5 343.31
## - GDP 1 45.31 1597.1 343.60
## - BMI 1 60.72 1612.5 344.86
## - Total_expenditure 1 61.62 1613.4 344.93
## - `HIV/AIDS` 1 184.99 1736.8 354.58
## - Schooling 1 367.01 1918.8 367.64
## - Adult_Mortality 1 488.49 2040.3 375.68
summary(life_backward)
##
## Call:
## lm(formula = Life_expectancy ~ Adult_Mortality + Alcohol + BMI +
## Total_expenditure + Diphtheria + `HIV/AIDS` + GDP + Schooling,
## data = life_new)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.4181 -1.7626 0.2275 1.9425 10.2316
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.706e+01 2.391e+00 23.868 < 2e-16 ***
## Adult_Mortality -2.648e-02 4.273e-03 -6.197 8.08e-09 ***
## Alcohol 1.799e-01 9.929e-02 1.812 0.072462 .
## BMI 4.049e-02 1.853e-02 2.185 0.030808 *
## Total_expenditure 2.885e-01 1.311e-01 2.201 0.029621 *
## Diphtheria>=86% Immunized 1.413e+00 7.930e-01 1.782 0.077201 .
## `HIV/AIDS` -1.018e+00 2.668e-01 -3.814 0.000216 ***
## GDP 4.677e-05 2.478e-05 1.887 0.061491 .
## Schooling 1.045e+00 1.946e-01 5.372 3.80e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.566 on 122 degrees of freedom
## Multiple R-squared: 0.8388, Adjusted R-squared: 0.8282
## F-statistic: 79.35 on 8 and 122 DF, p-value: < 2.2e-16
life_forward <- step(life_step0, scope = list(lower = life_step0, upper = life_step) ,direction = "forward")
## Start: AIC=564.92
## Life_expectancy ~ 1
##
## Df Sum of Sq RSS AIC
## + Schooling 1 6188.7 3437.8 432.03
## + Adult_Mortality 1 5708.6 3917.9 449.15
## + `HIV/AIDS` 1 3666.0 5960.5 504.12
## + BMI 1 2967.6 6658.9 518.63
## + Alcohol 1 2699.1 6927.4 523.81
## + Status 1 2453.1 7173.4 528.38
## + Polio 1 2294.8 7331.7 531.24
## + Diphtheria 1 1934.3 7692.2 537.53
## + GDP 1 1895.8 7730.7 538.19
## + `thinness__1-19_years` 1 1837.9 7788.6 539.16
## + Hepatitis_B 1 1087.6 8538.9 551.21
## + Total_expenditure 1 981.5 8645.0 552.83
## + infant_deaths 1 388.7 9237.8 561.52
## <none> 9626.5 564.92
## + Measles 1 22.9 9603.6 566.61
## + Population 1 12.5 9614.0 566.75
##
## Step: AIC=432.03
## Life_expectancy ~ Schooling
##
## Df Sum of Sq RSS AIC
## + Adult_Mortality 1 1393.17 2044.7 365.96
## + `HIV/AIDS` 1 1071.75 2366.1 385.09
## + BMI 1 167.16 3270.7 427.50
## + Polio 1 91.70 3346.1 430.49
## + GDP 1 83.87 3354.0 430.79
## + Status 1 77.22 3360.6 431.05
## + Total_expenditure 1 63.03 3374.8 431.61
## + Alcohol 1 55.28 3382.6 431.91
## <none> 3437.8 432.03
## + Diphtheria 1 35.90 3401.9 432.65
## + `thinness__1-19_years` 1 28.55 3409.3 432.94
## + infant_deaths 1 10.11 3427.7 433.64
## + Hepatitis_B 1 1.14 3436.7 433.99
## + Population 1 0.38 3437.5 434.02
## + Measles 1 0.02 3437.8 434.03
##
## Step: AIC=365.96
## Life_expectancy ~ Schooling + Adult_Mortality
##
## Df Sum of Sq RSS AIC
## + `HIV/AIDS` 1 195.968 1848.7 354.76
## + Alcohol 1 116.246 1928.4 360.29
## + Total_expenditure 1 90.566 1954.1 362.03
## + BMI 1 72.276 1972.4 363.25
## + Status 1 71.262 1973.4 363.31
## + GDP 1 60.149 1984.5 364.05
## + `thinness__1-19_years` 1 36.137 2008.5 365.63
## <none> 2044.7 365.96
## + Diphtheria 1 21.822 2022.8 366.56
## + Polio 1 20.848 2023.8 366.62
## + infant_deaths 1 2.592 2042.1 367.79
## + Hepatitis_B 1 0.963 2043.7 367.90
## + Measles 1 0.589 2044.1 367.92
## + Population 1 0.179 2044.5 367.95
##
## Step: AIC=354.76
## Life_expectancy ~ Schooling + Adult_Mortality + `HIV/AIDS`
##
## Df Sum of Sq RSS AIC
## + Alcohol 1 97.412 1751.3 349.67
## + Total_expenditure 1 90.216 1758.5 350.21
## + Status 1 79.954 1768.8 350.97
## + GDP 1 68.579 1780.1 351.81
## + BMI 1 53.390 1795.3 352.92
## + `thinness__1-19_years` 1 43.450 1805.2 353.65
## + Diphtheria 1 37.407 1811.3 354.08
## <none> 1848.7 354.76
## + Polio 1 24.960 1823.7 354.98
## + infant_deaths 1 5.061 1843.6 356.40
## + Hepatitis_B 1 4.822 1843.9 356.42
## + Measles 1 0.736 1848.0 356.71
## + Population 1 0.556 1848.2 356.72
##
## Step: AIC=349.67
## Life_expectancy ~ Schooling + Adult_Mortality + `HIV/AIDS` +
## Alcohol
##
## Df Sum of Sq RSS AIC
## + Total_expenditure 1 70.273 1681.0 346.31
## + BMI 1 44.275 1707.0 348.32
## + Status 1 42.635 1708.7 348.44
## + Diphtheria 1 41.585 1709.7 348.52
## + GDP 1 34.867 1716.4 349.04
## + `thinness__1-19_years` 1 29.835 1721.5 349.42
## <none> 1751.3 349.67
## + Polio 1 21.939 1729.3 350.02
## + Hepatitis_B 1 11.233 1740.1 350.83
## + infant_deaths 1 6.459 1744.8 351.19
## + Measles 1 2.926 1748.4 351.45
## + Population 1 1.422 1749.9 351.56
##
## Step: AIC=346.31
## Life_expectancy ~ Schooling + Adult_Mortality + `HIV/AIDS` +
## Alcohol + Total_expenditure
##
## Df Sum of Sq RSS AIC
## + BMI 1 45.718 1635.3 344.69
## + GDP 1 34.194 1646.8 345.61
## + Diphtheria 1 32.955 1648.1 345.71
## + Status 1 31.648 1649.4 345.82
## <none> 1681.0 346.31
## + `thinness__1-19_years` 1 24.237 1656.8 346.40
## + Polio 1 13.696 1667.3 347.23
## + Hepatitis_B 1 9.307 1671.7 347.58
## + infant_deaths 1 2.881 1678.1 348.08
## + Measles 1 0.812 1680.2 348.24
## + Population 1 0.330 1680.7 348.28
##
## Step: AIC=344.69
## Life_expectancy ~ Schooling + Adult_Mortality + `HIV/AIDS` +
## Alcohol + Total_expenditure + BMI
##
## Df Sum of Sq RSS AIC
## + GDP 1 43.107 1592.2 343.19
## + Diphtheria 1 38.201 1597.1 343.60
## + Status 1 36.996 1598.3 343.70
## <none> 1635.3 344.69
## + Polio 1 18.619 1616.7 345.19
## + Hepatitis_B 1 15.186 1620.1 345.47
## + `thinness__1-19_years` 1 6.730 1628.6 346.15
## + infant_deaths 1 0.657 1634.6 346.64
## + Population 1 0.069 1635.2 346.69
## + Measles 1 0.020 1635.3 346.69
##
## Step: AIC=343.19
## Life_expectancy ~ Schooling + Adult_Mortality + `HIV/AIDS` +
## Alcohol + Total_expenditure + BMI + GDP
##
## Df Sum of Sq RSS AIC
## + Diphtheria 1 40.401 1551.8 341.83
## <none> 1592.2 343.19
## + Status 1 20.838 1571.4 343.47
## + Polio 1 19.900 1572.3 343.55
## + Hepatitis_B 1 14.647 1577.5 343.98
## + `thinness__1-19_years` 1 4.249 1587.9 344.84
## + infant_deaths 1 0.438 1591.8 345.16
## + Measles 1 0.307 1591.9 345.17
## + Population 1 0.296 1591.9 345.17
##
## Step: AIC=341.83
## Life_expectancy ~ Schooling + Adult_Mortality + `HIV/AIDS` +
## Alcohol + Total_expenditure + BMI + GDP + Diphtheria
##
## Df Sum of Sq RSS AIC
## <none> 1551.8 341.83
## + Status 1 20.5808 1531.2 342.08
## + Hepatitis_B 1 20.2143 1531.6 342.11
## + `thinness__1-19_years` 1 7.4618 1544.3 343.20
## + Polio 1 3.3691 1548.4 343.54
## + Measles 1 3.0547 1548.7 343.57
## + Population 1 2.9088 1548.9 343.58
## + infant_deaths 1 0.4607 1551.3 343.79
summary(life_forward)
##
## Call:
## lm(formula = Life_expectancy ~ Schooling + Adult_Mortality +
## `HIV/AIDS` + Alcohol + Total_expenditure + BMI + GDP + Diphtheria,
## data = life_new)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.4181 -1.7626 0.2275 1.9425 10.2316
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.706e+01 2.391e+00 23.868 < 2e-16 ***
## Schooling 1.045e+00 1.946e-01 5.372 3.80e-07 ***
## Adult_Mortality -2.648e-02 4.273e-03 -6.197 8.08e-09 ***
## `HIV/AIDS` -1.018e+00 2.668e-01 -3.814 0.000216 ***
## Alcohol 1.799e-01 9.929e-02 1.812 0.072462 .
## Total_expenditure 2.885e-01 1.311e-01 2.201 0.029621 *
## BMI 4.049e-02 1.853e-02 2.185 0.030808 *
## GDP 4.677e-05 2.478e-05 1.887 0.061491 .
## Diphtheria>=86% Immunized 1.413e+00 7.930e-01 1.782 0.077201 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.566 on 122 degrees of freedom
## Multiple R-squared: 0.8388, Adjusted R-squared: 0.8282
## F-statistic: 79.35 on 8 and 122 DF, p-value: < 2.2e-16
life_both <- step(life_step, scope = list(lower = life_step0, upper = life_step) ,direction = "both")
## Start: AIC=350.18
## Life_expectancy ~ Status + Adult_Mortality + infant_deaths +
## Alcohol + Hepatitis_B + Measles + BMI + Polio + Total_expenditure +
## Diphtheria + `HIV/AIDS` + GDP + Population + `thinness__1-19_years` +
## Schooling
##
## Df Sum of Sq RSS AIC
## - Measles 1 1.48 1487.8 348.31
## - Polio 1 3.13 1489.5 348.46
## - infant_deaths 1 3.46 1489.8 348.49
## - Population 1 8.43 1494.8 348.92
## - `thinness__1-19_years` 1 11.75 1498.1 349.21
## - Alcohol 1 15.94 1502.3 349.58
## - Hepatitis_B 1 17.62 1504.0 349.73
## - Status 1 22.01 1508.4 350.11
## <none> 1486.3 350.18
## - GDP 1 31.00 1517.3 350.89
## - BMI 1 31.67 1518.0 350.95
## - Diphtheria 1 44.18 1530.5 352.02
## - Total_expenditure 1 45.26 1531.6 352.11
## - `HIV/AIDS` 1 189.47 1675.8 363.90
## - Schooling 1 260.51 1746.9 369.34
## - Adult_Mortality 1 461.20 1947.5 383.59
##
## Step: AIC=348.31
## Life_expectancy ~ Status + Adult_Mortality + infant_deaths +
## Alcohol + Hepatitis_B + BMI + Polio + Total_expenditure +
## Diphtheria + `HIV/AIDS` + GDP + Population + `thinness__1-19_years` +
## Schooling
##
## Df Sum of Sq RSS AIC
## - infant_deaths 1 2.43 1490.2 346.53
## - Polio 1 3.13 1491.0 346.59
## - Population 1 10.98 1498.8 347.28
## - `thinness__1-19_years` 1 13.58 1501.4 347.50
## - Hepatitis_B 1 17.16 1505.0 347.82
## - Alcohol 1 17.22 1505.0 347.82
## - Status 1 21.26 1509.1 348.17
## <none> 1487.8 348.31
## - BMI 1 30.35 1518.2 348.96
## - GDP 1 30.38 1518.2 348.96
## - Diphtheria 1 43.59 1531.4 350.10
## + Measles 1 1.48 1486.3 350.18
## - Total_expenditure 1 44.71 1532.5 350.19
## - `HIV/AIDS` 1 195.04 1682.9 362.45
## - Schooling 1 261.98 1749.8 367.56
## - Adult_Mortality 1 460.12 1948.0 381.61
##
## Step: AIC=346.53
## Life_expectancy ~ Status + Adult_Mortality + Alcohol + Hepatitis_B +
## BMI + Polio + Total_expenditure + Diphtheria + `HIV/AIDS` +
## GDP + Population + `thinness__1-19_years` + Schooling
##
## Df Sum of Sq RSS AIC
## - Polio 1 3.40 1493.7 344.83
## - Population 1 12.10 1502.4 345.59
## - `thinness__1-19_years` 1 14.44 1504.7 345.79
## - Hepatitis_B 1 17.05 1507.3 346.02
## - Alcohol 1 17.62 1507.9 346.07
## - Status 1 20.18 1510.4 346.29
## <none> 1490.2 346.53
## - GDP 1 30.29 1520.5 347.16
## - BMI 1 30.80 1521.0 347.21
## + infant_deaths 1 2.43 1487.8 348.31
## - Diphtheria 1 45.10 1535.3 348.43
## + Measles 1 0.45 1489.8 348.49
## - Total_expenditure 1 46.63 1536.9 348.56
## - `HIV/AIDS` 1 198.27 1688.5 360.89
## - Schooling 1 268.21 1758.5 366.21
## - Adult_Mortality 1 464.59 1954.8 380.08
##
## Step: AIC=344.83
## Life_expectancy ~ Status + Adult_Mortality + Alcohol + Hepatitis_B +
## BMI + Total_expenditure + Diphtheria + `HIV/AIDS` + GDP +
## Population + `thinness__1-19_years` + Schooling
##
## Df Sum of Sq RSS AIC
## - Population 1 12.68 1506.3 343.93
## - `thinness__1-19_years` 1 15.26 1508.9 344.16
## - Alcohol 1 16.31 1510.0 344.25
## - Hepatitis_B 1 16.77 1510.4 344.29
## - Status 1 19.58 1513.2 344.53
## <none> 1493.7 344.83
## - GDP 1 30.39 1524.0 345.46
## - BMI 1 31.78 1525.4 345.58
## + Polio 1 3.40 1490.2 346.53
## + infant_deaths 1 2.69 1491.0 346.59
## - Total_expenditure 1 44.86 1538.5 346.70
## + Measles 1 0.42 1493.2 346.79
## - Diphtheria 1 47.92 1541.6 346.96
## - `HIV/AIDS` 1 195.02 1688.7 358.90
## - Schooling 1 268.41 1762.1 364.47
## - Adult_Mortality 1 461.76 1955.4 378.11
##
## Step: AIC=343.93
## Life_expectancy ~ Status + Adult_Mortality + Alcohol + Hepatitis_B +
## BMI + Total_expenditure + Diphtheria + `HIV/AIDS` + GDP +
## `thinness__1-19_years` + Schooling
##
## Df Sum of Sq RSS AIC
## - `thinness__1-19_years` 1 5.83 1512.2 342.44
## - Status 1 18.44 1524.8 343.53
## - Hepatitis_B 1 18.48 1524.8 343.53
## - Alcohol 1 19.84 1526.2 343.65
## <none> 1506.3 343.93
## - GDP 1 30.62 1537.0 344.57
## + Population 1 12.68 1493.7 344.83
## - BMI 1 34.43 1540.8 344.89
## + Measles 1 7.75 1498.6 345.26
## + Polio 1 3.97 1502.4 345.59
## + infant_deaths 1 3.61 1502.7 345.62
## - Total_expenditure 1 43.85 1550.2 345.69
## - Diphtheria 1 44.88 1551.2 345.78
## - `HIV/AIDS` 1 198.24 1704.6 358.13
## - Schooling 1 302.29 1808.6 365.89
## - Adult_Mortality 1 459.48 1965.8 376.81
##
## Step: AIC=342.44
## Life_expectancy ~ Status + Adult_Mortality + Alcohol + Hepatitis_B +
## BMI + Total_expenditure + Diphtheria + `HIV/AIDS` + GDP +
## Schooling
##
## Df Sum of Sq RSS AIC
## - Hepatitis_B 1 19.05 1531.2 342.08
## - Status 1 19.41 1531.6 342.11
## - Alcohol 1 20.94 1533.1 342.24
## <none> 1512.2 342.44
## - GDP 1 32.62 1544.8 343.23
## + `thinness__1-19_years` 1 5.83 1506.3 343.93
## + Measles 1 4.34 1507.8 344.06
## + Polio 1 4.34 1507.8 344.06
## + Population 1 3.26 1508.9 344.16
## - Diphtheria 1 44.24 1556.4 344.22
## + infant_deaths 1 0.29 1511.9 344.41
## - Total_expenditure 1 46.59 1558.8 344.41
## - BMI 1 52.90 1565.1 344.94
## - `HIV/AIDS` 1 193.78 1705.9 356.23
## - Schooling 1 326.72 1838.9 366.06
## - Adult_Mortality 1 459.47 1971.6 375.20
##
## Step: AIC=342.08
## Life_expectancy ~ Status + Adult_Mortality + Alcohol + BMI +
## Total_expenditure + Diphtheria + `HIV/AIDS` + GDP + Schooling
##
## Df Sum of Sq RSS AIC
## - Status 1 20.58 1551.8 341.83
## <none> 1531.2 342.08
## + Hepatitis_B 1 19.05 1512.2 342.44
## - GDP 1 28.71 1559.9 342.51
## - Alcohol 1 29.72 1560.9 342.60
## - Diphtheria 1 40.14 1571.4 343.47
## + `thinness__1-19_years` 1 6.40 1524.8 343.53
## + Measles 1 4.26 1527.0 343.71
## + Polio 1 4.06 1527.2 343.73
## + Population 1 3.81 1527.4 343.75
## + infant_deaths 1 0.46 1530.8 344.04
## - Total_expenditure 1 53.10 1584.3 344.54
## - BMI 1 63.53 1594.7 345.40
## - `HIV/AIDS` 1 189.41 1720.6 355.36
## - Schooling 1 315.95 1847.2 364.65
## - Adult_Mortality 1 478.79 2010.0 375.72
##
## Step: AIC=341.83
## Life_expectancy ~ Adult_Mortality + Alcohol + BMI + Total_expenditure +
## Diphtheria + `HIV/AIDS` + GDP + Schooling
##
## Df Sum of Sq RSS AIC
## <none> 1551.8 341.83
## + Status 1 20.58 1531.2 342.08
## + Hepatitis_B 1 20.21 1531.6 342.11
## - Diphtheria 1 40.40 1592.2 343.19
## + `thinness__1-19_years` 1 7.46 1544.3 343.20
## - Alcohol 1 41.76 1593.5 343.31
## + Polio 1 3.37 1548.4 343.54
## + Measles 1 3.05 1548.7 343.57
## + Population 1 2.91 1548.9 343.58
## - GDP 1 45.31 1597.1 343.60
## + infant_deaths 1 0.46 1551.3 343.79
## - BMI 1 60.72 1612.5 344.86
## - Total_expenditure 1 61.62 1613.4 344.93
## - `HIV/AIDS` 1 184.99 1736.8 354.58
## - Schooling 1 367.01 1918.8 367.64
## - Adult_Mortality 1 488.49 2040.3 375.68
summary(life_both)
##
## Call:
## lm(formula = Life_expectancy ~ Adult_Mortality + Alcohol + BMI +
## Total_expenditure + Diphtheria + `HIV/AIDS` + GDP + Schooling,
## data = life_new)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.4181 -1.7626 0.2275 1.9425 10.2316
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.706e+01 2.391e+00 23.868 < 2e-16 ***
## Adult_Mortality -2.648e-02 4.273e-03 -6.197 8.08e-09 ***
## Alcohol 1.799e-01 9.929e-02 1.812 0.072462 .
## BMI 4.049e-02 1.853e-02 2.185 0.030808 *
## Total_expenditure 2.885e-01 1.311e-01 2.201 0.029621 *
## Diphtheria>=86% Immunized 1.413e+00 7.930e-01 1.782 0.077201 .
## `HIV/AIDS` -1.018e+00 2.668e-01 -3.814 0.000216 ***
## GDP 4.677e-05 2.478e-05 1.887 0.061491 .
## Schooling 1.045e+00 1.946e-01 5.372 3.80e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.566 on 122 degrees of freedom
## Multiple R-squared: 0.8388, Adjusted R-squared: 0.8282
## F-statistic: 79.35 on 8 and 122 DF, p-value: < 2.2e-16
According to each feature selection, the 3 variables that have the most siginificance with regards to Life_expectancy are Adult_Mortality, HIV/AIDS, and Schooling. They have the lowest p-values in each of the selection methods as well as chosen by the model with ’***’ next to them.
Let us create a model with just these variables.
life_highp <- lm(formula = Life_expectancy ~ Adult_Mortality + `HIV/AIDS` + Schooling, data = life_new)
summary(life_highp)
##
## Call:
## lm(formula = Life_expectancy ~ Adult_Mortality + `HIV/AIDS` +
## Schooling, data = life_new)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.4261 -2.1214 0.2122 2.2014 9.4356
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 54.506514 2.298683 23.712 < 2e-16 ***
## Adult_Mortality -0.026817 0.004498 -5.962 2.31e-08 ***
## `HIV/AIDS` -1.035604 0.282249 -3.669 0.000357 ***
## Schooling 1.668685 0.148224 11.258 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.815 on 127 degrees of freedom
## Multiple R-squared: 0.808, Adjusted R-squared: 0.8034
## F-statistic: 178.1 on 3 and 127 DF, p-value: < 2.2e-16
data.frame(model = c("life_backward","life_forward","life_both", "life_highp"),
AdjRsquare = c(summary(life_backward)$adj.r.square,
summary(life_forward)$adj.r.square,
summary(life_both)$adj.r.square,
summary(life_highp)$adj.r.square))
## model AdjRsquare
## 1 life_backward 0.8282293
## 2 life_forward 0.8282293
## 3 life_both 0.8282293
## 4 life_highp 0.8034202
Given the Adj R-Squared Value for forward, backward, and both direction models are the same, we can choose whichever model we want. We will select Backwards Selection Model.
life_pred <- predict(life_backward, life_new)
data.frame(Method = c("MSE","RMSE","MAE", "MAPE"),
Error = c(MSE(life_pred, life_new$Life_expectancy),
RMSE(life_pred, life_new$Life_expectancy),
MAE(life_pred, life_new$Life_expectancy),
MAPE(life_pred, life_new$Life_expectancy)))
## Method Error
## 1 MSE 11.8457385
## 2 RMSE 3.4417639
## 3 MAE 2.5772651
## 4 MAPE 0.0375666
range(life_new$Life_expectancy)
## [1] 48.1 89.0
The error values are quite small compared to the range of values for Life_expectancy. This means we can expect the model’s predicted values to be close to the actual values.
Let us check assumptions of the Model to see if it passes:
hist(life_backward$residuals, breaks = 20)
The histogram looks fairly normally distributed with the higher frequency residuals more towards the center.
plot(life_backward, which = 2)
The QQ plot looks fairly linear as the the majority of the values fall along the line of best fit.
shapiro.test(life_backward$residuals)
##
## Shapiro-Wilk normality test
##
## data: life_backward$residuals
## W = 0.97649, p-value = 0.02246
Our Shapiro-Wilk Test shows that we do not pass normality of residuals
plot(cooks.distance(life_backward))
Cook’s D also tells us we have outliers present in the data. We will try taking the Log of the numerical values within the model to see if this sovles our
plot(life_backward$residuals)